package com.wzxy.parse.tbw;

import java.io.IOException;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.wzxy.spider.tbw.entity.TbwNews;
import com.wzxy.spider.tbw.service.impl.TbwService;
import com.wzxy.util.MyTimeUtils;
import com.wzxy.util.RegexUtils;
import com.wzxy.wzinfo.entiry.NewsIntroduce;
import com.wzxy.wzinfo.service.impl.NewsIntService;


/**
 * 解析原文
 * @author h
 * http://tengbei.net/
 */
@Service
public class TbwOriginArticle{
	
	@Autowired
	private TbwService tbwService;
	@Autowired
	private NewsIntService newsIntService;
	
	public void startParse(TbwNews tbwNews) throws IOException{
		
		if(StringUtils.isNotBlank(tbwNews.getHtml())){
			Document doc = Jsoup.parse(tbwNews.getHtml());
			NewsIntroduce introduce = new NewsIntroduce();
			
			introduce.setURL(tbwNews.getUrl());
			
			String parseTime = MyTimeUtils.getNowDateString();
			introduce.setCreationTime(parseTime);
			System.out.println("解析时间：" + parseTime);
			
			String title = tbwNews.getTitle();	
			if(StringUtils.isBlank(title)){
				title = doc.select("div.section > h2.text-center").text();
			}
			introduce.setTitle(title);
			System.out.println("文章的标题：" + title);
			
			String readNum = doc.select("p.text-center > span.text-error").eq(0).text();
			System.out.println("阅读数：" + readNum);
			
			String commentNum = doc.select("p.text-center > span.text-error").eq(1).text();
			System.out.println("评论数：" + commentNum);
			
			String from = doc.select("p.text-center > strong").text();
			System.out.println("来源：" + from);
	
			/** 正则表达式获取时间 */
			String postDate = doc.select("p.text-center").text();
			introduce.setPostTime(RegexUtils.getFormatDate(postDate));
			System.out.println("发布日期：" + RegexUtils.getFormatDate(postDate));
			
			String content = "";
			Element contentEle = doc.select("div.newscontent").first();	
		
			if(StringUtils.isNotBlank(doc.select("[id=js_content]").text())){
				content = doc.select("[id=js_content]").text();
			}else if(StringUtils.isNotBlank(contentEle.ownText())){
				content = contentEle.ownText();
			}else{	
				/** 去除上下篇标题 */
				doc.select("div.newscontent > ul").first().remove();
				/** 去除扫一扫分享到微信或ＱＱ */
				doc.select("div.newscontent > a").first().remove();
				content = doc.select("div.newscontent").text();
			}
			introduce.setContent(content);
			System.out.println("文章内容：" + content);
	
			/** 评论(没有解析嵌套评论) */
			Elements comments = doc.select("[id=cmdlist]").select("li");
			for (Element comment : comments) {
				Element cContentEle =  comment.select("span.message").first();
				if(cContentEle != null){
				System.out.println("地区："+comment.select("span.header > span.from").text());
				System.out.println("评论内容："+cContentEle.ownText());
				System.out.println("时间："+comment.select("span.header > span.time").text());
				}
			}
			newsIntService.save(introduce);
			tbwNews.setStatus(com.wzxy.common.Const.DONE);
			tbwService.update(tbwNews, tbwNews.getId());
			
		}
		

	}
	
	public static void main(String[] args) {
//		try {
//			new TbwOriginArticle().start();
//		} catch (IOException e) {
//			e.printStackTrace();
//		}
	}
	
}
